• create a table for each chart set
  • group queries by sub-type (sorting … merging)
  • charts with two columns
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ---------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DT)
library(htmltools)

Setup

# Report parameters supplied by the knitr `params` list, plus derived
# plotting settings used throughout the document.
options      <- params$options
targetVar    <- params$targetVar
showOutliers <- params$showOutliers

# Units shown on the y axis depend on the chosen target variable.
targetUnits      <- ifelse(targetVar == 'Throughput', 'MB/s', 'secs')
# Quantile 1 keeps every outlier; 0.90 trims the top decile from the plots.
outliersQuantile <- ifelse(showOutliers, 1, 0.90)
# Point shape 19 draws boxplot outliers; NA hides them entirely.
outliersShape    <- ifelse(showOutliers, 19, NA)

plotCols <- 2  # facet columns used by all charts below

Reading Files

# Read one raw benchmark CSV (no header row) and attach the canonical
# column names used throughout this analysis. Returns a data.frame with
# one row per timed operation.
read.clean.files <- function(filename){
  raw <- read.csv(filename, header = FALSE)
  colnames(raw) <- c("Language", "Randomize", "Dataset", "MachineID",
                     "RunID", "Type", "Operation", "TimeTaken")
  raw
}

# List every result CSV under ../Results/ and load them grouped by the
# environment ("Setup") that produced them.
# FIX: the original pattern ".csv$" used an unescaped '.', a regex
# wildcard that also matched e.g. "xcsv"; it is escaped here.
files = list.files(path = "../Results/", pattern = "\\.csv$",
                   recursive = TRUE, full.names = TRUE) # List all .csv files
#files

# Read all files whose path contains `tag`, row-bind them into a single
# data.frame and label every row with the human-readable Setup name.
# (Replaces three copy-pasted read/merge/label stanzas.)
read.setup.files = function(all.files, tag, setup.label){
  matched = all.files[grepl(tag, all.files)]
  rows = lapply(matched, read.csv, header = FALSE) # Read the files into list
  merged = do.call(rbind, rows)                    # combine the data.frames
  merged$Setup = setup.label
  merged
}

merged.databricks = read.setup.files(files, "Databricks", "Databricks")
merged.local.vm   = read.setup.files(files, "Local_VM", "Local VM")
merged.cluster    = read.setup.files(files, "Cluster", "Cluster")


# Stack the three environments into one data.frame and normalise the
# column types and labels used by the rest of the analysis.
merged_data = rbind(merged.databricks,merged.local.vm,merged.cluster)
merged_data$Setup = as.factor(merged_data$Setup)

# Raw CSVs carry no header, so assign the canonical column names.
column.names = c("Language","Randomize","Dataset","MachineID","RunID","Type","Operation","TimeTaken","Setup")
colnames(merged_data) = column.names
# Harmonise singular/plural spellings of the query Type labels.
merged_data$Type = as.factor(gsub(pattern = "Operations", replacement = "Operation", x = merged_data$Type))

# Convert columns to factors
merged_data$MachineID = as.factor(merged_data$MachineID)
merged_data$Randomize = as.factor(merged_data$Randomize)
merged_data$RunID = as.factor(merged_data$RunID)

# Reduce "dataset_100MB" to the bare nominal size label "100".
merged_data$Dataset = sub("dataset_", "", merged_data$Dataset) 
merged_data$Dataset = sub("MB$", "", merged_data$Dataset) 
merged_data$Dataset = as.factor(merged_data$Dataset)

# Trim whitespace, then collapse the two spellings of the math operation
# into one label.
# NOTE(review): the `df[cond,]$col = value` pattern errors when zero rows
# match the condition — works for this dataset, fragile for new data.
merged_data$Operation = trimws(as.character(merged_data$Operation),'both')
merged_data[merged_data$Operation =='Mathematical Operations on Columns',]$Operation ='Mathematical Operation on Columns'

# Derive a coarse sub-type category from the Operation name prefix (or
# the " Join " infix); rows matching no pattern keep the empty string.
merged_data$subType = ''
merged_data[grepl('^Sorting',merged_data$Operation),]$subType = 'Sorting'
merged_data[grepl('^Ranking',merged_data$Operation),]$subType = 'Ranking'
merged_data[grepl('^Split',merged_data$Operation),]$subType = 'Splitting'
merged_data[grepl('^GroupBy',merged_data$Operation),]$subType = 'Grouping'
merged_data[grepl(' Join ',merged_data$Operation),]$subType = 'Joining'
merged_data[grepl('^Merge',merged_data$Operation),]$subType = 'Merging'
merged_data[grepl('^Filter',merged_data$Operation),]$subType = 'Filtering'
merged_data[grepl('^Mathematical',merged_data$Operation),]$subType = 'Mathematics'
merged_data[grepl('^Pivot',merged_data$Operation),]$subType = 'Pivots'
merged_data[grepl('^Running|^Shift',merged_data$Operation),]$subType = 'Run/Shift'
merged_data[grepl('^Writing',merged_data$Operation),]$subType = 'Writing'
# Actual on-disk sizes (in MB) of each generated dataset, measured once
# (see the commented file.size() calls) and hard-coded so the report can
# knit without the raw data files present.
size_10MB =  11.4789848327637 # file.size("../../Data/Databricks/machine2/dataset_10MB.csv")/(1024*1024)
size_100MB = 115.640992164612 # file.size("../../Data/Databricks/machine2/dataset_100MB.csv")/(1024*1024) 
size_200MB = 229.8573  
size_300MB = 343.2709
size_500MB = 576.678165435791 # file.size("../../Data/Databricks/machine2/dataset_500MB.csv")/(1024*1024) 

print(paste("Actual Size of 10MB file (in MB)",size_10MB))
## [1] "Actual Size of 10MB file (in MB) 11.4789848327637"
print(paste("Actual Size of 100MB file (in MB)",size_100MB))
## [1] "Actual Size of 100MB file (in MB) 115.640992164612"
print(paste("Actual Size of 200MB file (in MB)",size_200MB))
## [1] "Actual Size of 200MB file (in MB) 229.8573"
print(paste("Actual Size of 300MB file (in MB)",size_300MB))
## [1] "Actual Size of 300MB file (in MB) 343.2709"
print(paste("Actual Size of 500MB file (in MB)",size_500MB))
## [1] "Actual Size of 500MB file (in MB) 576.678165435791"
# Lookup table: nominal dataset label -> real size in MB.
size_info = data.frame(Dataset = c("10","100","200","300","500")
                       ,Size = c(size_10MB,size_100MB,size_200MB,size_300MB,size_500MB))
str(size_info)
## 'data.frame':    5 obs. of  2 variables:
##  $ Dataset: Factor w/ 5 levels "10","100","200",..: 1 2 3 4 5
##  $ Size   : num  11.5 115.6 229.9 343.3 576.7
# Attach the real size to every row and derive Throughput as MB processed
# per second of query time.
merged_data = merged_data %>%
  merge(size_info,by='Dataset') %>%
  mutate(Throughput = Size/TimeTaken)

# Character vector of environment names, used to iterate the per-setup plots.
Setups = unique(as.character(merged_data$Setup))
# Keep an untouched copy before excluding the first run.
data_raw = merged_data
# Drop RunID 1 from the analysis (presumably a warm-up run — TODO confirm).
merged_data = filter(merged_data, RunID != 1)

# Structure check after filtering: 12 variables including Size/Throughput.
str(merged_data)
## 'data.frame':    7397 obs. of  12 variables:
##  $ Dataset   : Factor w/ 5 levels "10","100","200",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Language  : Factor w/ 2 levels "PySpark","Scala": 2 1 2 2 2 1 1 1 1 2 ...
##  $ Randomize : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ MachineID : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ RunID     : Factor w/ 6 levels "1","2","3","4",..: 5 6 5 5 5 6 6 6 6 4 ...
##  $ Type      : Factor w/ 4 levels "Aggregate Operation",..: 2 2 1 2 1 2 2 2 2 2 ...
##  $ Operation : chr  "Sorting Asc 1 column" "Sorting Desc 5 column" "Ranking by Group" "Split 1 Column into 5" ...
##  $ TimeTaken : num  1.88 4.36 1.88 1.04 12.38 ...
##  $ Setup     : Factor w/ 3 levels "Cluster","Databricks",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ subType   : chr  "Sorting" "Sorting" "Ranking" "Splitting" ...
##  $ Size      : num  11.5 11.5 11.5 11.5 11.5 ...
##  $ Throughput: num  6.093 2.635 6.103 11.08 0.927 ...
# Peek at the first rows of the cleaned, merged benchmark data.
head(merged_data)
##   Dataset Language Randomize MachineID RunID                Type
## 1      10    Scala         1         1     5    Column Operation
## 2      10  PySpark         1         1     6    Column Operation
## 3      10    Scala         1         1     5 Aggregate Operation
## 4      10    Scala         1         1     5    Column Operation
## 5      10    Scala         1         1     5 Aggregate Operation
## 6      10  PySpark         1         1     6    Column Operation
##                   Operation TimeTaken      Setup   subType     Size
## 1      Sorting Asc 1 column  1.884000 Databricks   Sorting 11.47898
## 2     Sorting Desc 5 column  4.355605 Databricks   Sorting 11.47898
## 3          Ranking by Group  1.881000 Databricks   Ranking 11.47898
## 4     Split 1 Column into 5  1.036000 Databricks Splitting 11.47898
## 5         GroupBy 5 columns 12.381000 Databricks  Grouping 11.47898
## 6 Full Outer Join 5 Columns  9.544184 Databricks   Joining 11.47898
##   Throughput
## 1  6.0928794
## 2  2.6354514
## 3  6.1025969
## 4 11.0801012
## 5  0.9271452
## 6  1.2027204
# Distribution summary across all factor columns and the numeric targets.
summary(merged_data)
##  Dataset       Language    Randomize MachineID RunID   
##  10 :1740   PySpark:3737   1:7397    1:4397    1:   0  
##  100:1620   Scala  :3660             2:3000    2:1698  
##  200:1350                                      3:1671  
##  300:1337                                      4:1455  
##  500:1350                                      5:1293  
##                                                6:1280  
##                   Type       Operation           TimeTaken      
##  Aggregate Operation: 998   Length:7397        Min.   :  0.230  
##  Column Operation   :3654   Class :character   1st Qu.:  3.161  
##  Mixed Operation    : 749   Mode  :character   Median : 10.261  
##  Row Operation      :1996                      Mean   : 25.683  
##                                                3rd Qu.: 26.573  
##                                                Max.   :631.088  
##         Setup        subType               Size          Throughput     
##  Cluster   :1500   Length:7397        Min.   : 11.48   Min.   :  0.273  
##  Databricks:2972   Class :character   1st Qu.:115.64   1st Qu.:  5.916  
##  Local VM  :2925   Mode  :character   Median :229.86   Median : 13.489  
##                                       Mean   :237.27   Mean   : 25.278  
##                                       3rd Qu.:343.27   3rd Qu.: 26.895  
##                                       Max.   :576.68   Max.   :268.098
# TRUE when knitting outside RStudio; the same expression is the default
# for the `enabled` argument of subchunkify() further down.
print(!(.Platform$GUI == "RStudio"))
## [1] TRUE

Common Functions

# Delete figure files left behind by previous subchunkify() renders.
# Returns TRUE when every matched file was removed (vacuously TRUE when
# nothing matched).
subchunkify_clear <- function() {
  leftovers <- dir(path = 'figure/', pattern = 'sub_chunk_',
                   include.dirs = TRUE, full.names = TRUE)
  all(file.remove(leftovers))
}

# Render a ggplot inside its own dynamically generated knitr chunk so each
# plot can have an individual fig.height / fig.width. When `enabled` is
# FALSE (e.g. interactive RStudio sessions) it falls back to plot(g).
# NOTE(review): chunk labels come from floor(runif(1) * 10000), so labels
# can collide across calls and knitr would error on a duplicate label.
subchunkify <- function(g, fig_height=7, fig_width=5, enabled = !(.Platform$GUI == "RStudio")) {
  if (enabled==FALSE) return(plot(g))
  # Deparse a zero-argument closure returning the plot, so the plot can be
  # embedded as source text inside the generated chunk.
  g_deparsed <- paste0(deparse(
    function() {g}
  ), collapse = '')
  
  # Build the fenced chunk; the backtick fence is split as "`","``" so this
  # source does not itself terminate the enclosing Rmd chunk.
  sub_chunk <- paste0("
  `","``{r sub_chunk_", floor(runif(1) * 10000), ", fig.height=", fig_height, ", fig.width=", fig_width, ", echo=FALSE}",
  "\n(", 
    g_deparsed
    , ")()",
  "\n`","``
  ")
  
  # Expand and knit the generated chunk, emitting its output in place.
  cat(knitr::knit(text = knitr::knit_expand(text = sub_chunk), quiet = TRUE))
}


# Apply the shared PySpark-vs-Scala palette (red/blue) and the light theme
# to a ggplot. `strip_angle` rotates the y facet strip labels; any extra
# theme() settings can be forwarded through `...`.
ggplot_colors = function(plot,strip_angle=0,...){
  line_palette = c("#ca0020","#0571b0")
  fill_palette = c("#f4a582","#92c5de")
  languages    = c('PySpark','Scala')
  styled = plot +
    scale_color_manual(values = line_palette, breaks = languages) +
    scale_fill_manual(values = fill_palette, breaks = languages) +
    theme_light() +
    ggplot2::theme(strip.text.y = element_text(angle = strip_angle), ...)
  styled
}

Simple Linear Regression Model

Using a simple LM we can identify the elements that contribute to the speed of the queries. The LanguageScala coefficient (about 10.3 in the summary below) measures the average effect, in the target variable's units, of using Scala instead of PySpark, keeping the other variables constant.

# Clear stale sub-chunk figures, then fit an additive linear model of the
# target variable on dataset size, language, operation and environment.
subchunkify_clear()
## [1] TRUE
form=as.formula(paste0(targetVar,' ~ Dataset + Language  + Operation + Setup'))
model = lm(data=merged_data,formula=form)
summary(model)
## 
## Call:
## lm(formula = form, data = merged_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -89.659  -8.926  -0.922   7.598 191.520 
## 
## Coefficients:
##                                            Estimate Std. Error t value
## (Intercept)                                 49.8435     1.5592  31.967
## Dataset100                                  17.2561     0.7583  22.758
## Dataset200                                  19.5960     0.8131  24.099
## Dataset300                                  18.6564     0.8152  22.885
## Dataset500                                  19.1115     0.8131  23.504
## LanguageScala                               10.2988     0.5024  20.501
## OperationFilter Reg Ex 1                    15.3760     1.9341   7.950
## OperationFilter Reg Ex 2                    15.8812     1.9341   8.211
## OperationFull Outer Join 10 Columns        -37.2898     2.6739 -13.946
## OperationFull Outer Join 3 Columns         -32.3872     3.7101  -8.729
## OperationFull Outer Join 5 Columns         -39.3983     2.6739 -14.734
## OperationGroupBy 1 column                  -39.8436     1.9341 -20.601
## OperationGroupBy 10 columns                -45.3896     1.9321 -23.492
## OperationGroupBy 5 columns                 -43.2951     1.9341 -22.386
## OperationInner Join 10 Columns             -34.9459     2.6739 -13.069
## OperationInner Join 3 Columns              -31.1775     3.7101  -8.403
## OperationInner Join 5 Columns              -36.2929     2.6739 -13.573
## OperationLeft Outer Join 10 Columns        -35.4052     2.6739 -13.241
## OperationLeft Outer Join 3 Columns         -31.1768     3.7101  -8.403
## OperationLeft Outer Join 5 Columns         -37.9073     2.6739 -14.177
## OperationMathematical Operation on Columns  55.6608     1.9341  28.779
## OperationMerge 10 columns into 1           -28.7629     1.9341 -14.872
## OperationMerge 2 columns into 1            -26.8238     1.9321 -13.883
## OperationMerge 5 columns into 1            -27.7477     1.9321 -14.361
## OperationPivot 1 Rows and 1 Column         -11.1919     1.9321  -5.793
## OperationPivot 10 Rows and 1 Column        -33.2620     1.9321 -17.215
## OperationPivot 5 Rows and 1 Column         -29.5658     1.9341 -15.287
## OperationRanking by Group                  -33.1435     1.9321 -17.154
## OperationRunning Sum                         5.1520     1.9321   2.666
## OperationShift (Lag)                         8.3843     1.9321   4.339
## OperationSorting Asc 1 column              -39.3792     1.9321 -20.381
## OperationSorting Asc 10 column             -39.9519     1.9321 -20.678
## OperationSorting Asc 5 column              -39.5402     1.9341 -20.444
## OperationSorting Desc 1 column             -39.3327     1.9341 -20.337
## OperationSorting Desc 10 column            -39.8533     1.9341 -20.606
## OperationSorting Desc 5 column             -39.5264     1.9341 -20.437
## OperationSplit 1 Column into 10            -33.4359     1.9321 -17.305
## OperationSplit 1 Column into 5             -30.8942     1.9321 -15.990
## OperationWriting 100 new rows              -23.8030     1.9321 -12.320
## OperationWriting 1000 new rows             -24.2613     1.9341 -12.544
## OperationWriting 10000 new rows            -26.6119     1.9321 -13.773
## SetupDatabricks                            -22.5190     0.6836 -32.943
## SetupLocal VM                              -29.1892     0.6857 -42.570
##                                            Pr(>|t|)    
## (Intercept)                                 < 2e-16 ***
## Dataset100                                  < 2e-16 ***
## Dataset200                                  < 2e-16 ***
## Dataset300                                  < 2e-16 ***
## Dataset500                                  < 2e-16 ***
## LanguageScala                               < 2e-16 ***
## OperationFilter Reg Ex 1                   2.14e-15 ***
## OperationFilter Reg Ex 2                   2.56e-16 ***
## OperationFull Outer Join 10 Columns         < 2e-16 ***
## OperationFull Outer Join 3 Columns          < 2e-16 ***
## OperationFull Outer Join 5 Columns          < 2e-16 ***
## OperationGroupBy 1 column                   < 2e-16 ***
## OperationGroupBy 10 columns                 < 2e-16 ***
## OperationGroupBy 5 columns                  < 2e-16 ***
## OperationInner Join 10 Columns              < 2e-16 ***
## OperationInner Join 3 Columns               < 2e-16 ***
## OperationInner Join 5 Columns               < 2e-16 ***
## OperationLeft Outer Join 10 Columns         < 2e-16 ***
## OperationLeft Outer Join 3 Columns          < 2e-16 ***
## OperationLeft Outer Join 5 Columns          < 2e-16 ***
## OperationMathematical Operation on Columns  < 2e-16 ***
## OperationMerge 10 columns into 1            < 2e-16 ***
## OperationMerge 2 columns into 1             < 2e-16 ***
## OperationMerge 5 columns into 1             < 2e-16 ***
## OperationPivot 1 Rows and 1 Column         7.22e-09 ***
## OperationPivot 10 Rows and 1 Column         < 2e-16 ***
## OperationPivot 5 Rows and 1 Column          < 2e-16 ***
## OperationRanking by Group                   < 2e-16 ***
## OperationRunning Sum                        0.00768 ** 
## OperationShift (Lag)                       1.45e-05 ***
## OperationSorting Asc 1 column               < 2e-16 ***
## OperationSorting Asc 10 column              < 2e-16 ***
## OperationSorting Asc 5 column               < 2e-16 ***
## OperationSorting Desc 1 column              < 2e-16 ***
## OperationSorting Desc 10 column             < 2e-16 ***
## OperationSorting Desc 5 column              < 2e-16 ***
## OperationSplit 1 Column into 10             < 2e-16 ***
## OperationSplit 1 Column into 5              < 2e-16 ***
## OperationWriting 100 new rows               < 2e-16 ***
## OperationWriting 1000 new rows              < 2e-16 ***
## OperationWriting 10000 new rows             < 2e-16 ***
## SetupDatabricks                             < 2e-16 ***
## SetupLocal VM                               < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.58 on 7354 degrees of freedom
## Multiple R-squared:  0.6159, Adjusted R-squared:  0.6137 
## F-statistic: 280.8 on 42 and 7354 DF,  p-value: < 2.2e-16

Comparing of Environments

Comparing Local VM vs. Cloud environments with similar configurations, controlling by the type of operation. The cloud machine is optimized for Scala and Spark, while the local machine has a standard installation of the software.

We notice the Databricks environment benefits the aggregate operations, and that Databricks benefits the Scala environment for the Row Operations. Otherwise, the remaining situations show similar performance across languages and environments.

# Boxplots of the target variable per environment, faceted by query type,
# with the two languages side by side.
filtered = merged_data
# Cap the y axis at the configured quantile to trim extreme outliers.
y_cap = quantile(filtered[[targetVar]], probs = outliersQuantile)
env_plot = ggplot(data = filtered,
                  aes_string(x = 'Setup', y = targetVar,
                             color = 'Language', fill = 'Language')) +
  geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape) +
  scale_y_continuous(limits = c(0, y_cap),
                     labels = scales::comma_format(accuracy = 1, suffix = targetUnits)) +
  facet_wrap(~Type, scales = 'free', ncol = plotCols)
ggplot_colors(env_plot, axis.text.y = element_text(size = 7), legend.position = 'right')
## Warning: Removed 740 rows containing non-finite values (stat_boxplot).

Comparing different data sizes

Comparing the performance of the different queries among different sizes of dataset. We notice that the increased size has an important impact on the time. PySpark suffers more from the increased size, mainly for Row and Column Operations, while the Scala language shows a better performance, as we can see below.

# Boxplots of the target variable by nominal dataset size, faceted by
# query type. `filtered` is a subset of the benchmark rows; `title` is
# the chart title.
plotDataSize = function(filtered, title){
  # Cap the y axis at the configured quantile to trim extreme outliers.
  y_cap = quantile(filtered[[targetVar]], probs = outliersQuantile)
  base = ggplot(data = filtered,
                aes_string(x = 'Dataset', y = targetVar,
                           color = 'Language', fill = 'Language')) +
    geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape) +
    scale_y_continuous(limits = c(0, y_cap),
                       labels = scales::comma_format(accuracy = 1, suffix = targetUnits)) +
    facet_wrap(~Type, labeller = label_wrap_gen(width = 100),
               scales = 'free_x', ncol = plotCols) +
    ggplot2::ggtitle(title)

  ggplot_colors(base, strip_angle = 0,
                axis.text.y = element_text(size = 7),
                axis.text.x = element_text(size = 9))
}

All Setups

# All environments combined.
filtered = merged_data

# FIX: corrected the user-facing title typo "Datsets" -> "Datasets".
plotDataSize(filtered, 'Comparing Datasets - All Environments')
## Warning: Removed 740 rows containing non-finite values (stat_boxplot).

By Setup

# Re-draw the dataset-size comparison once per environment.
# FIX: corrected the user-facing title typo "Datsets" -> "Datasets".
for (s in Setups) {
  message(s)  # progress marker in the knit log
  filtered = merged_data %>% filter(Setup == s)
  # plot() forces rendering inside the loop body (ggplot objects do not
  # auto-print inside for loops).
  plot(plotDataSize(filtered, paste0('Comparing Datasets - ', s)))
}
## Databricks
## Warning: Removed 298 rows containing non-finite values (stat_boxplot).
## Local VM

## Warning: Removed 293 rows containing non-finite values (stat_boxplot).
## Cluster

## Warning: Removed 150 rows containing non-finite values (stat_boxplot).

Comparing Query Types

As expected, the slowest queries are the Aggregate ones, while the fastest are the Row Operations. Scala outperforms PySpark on the row operations, while we don't see a significant difference on the other query types.

# Horizontal boxplots per query Type (ordered by the median of the target
# variable), one facet per environment.
filtered = merged_data
filtered$Type = forcats::fct_reorder(filtered$Type, filtered[[targetVar]],
                                     .fun = median, .desc = FALSE)
# Cap the y axis at the configured quantile to trim extreme outliers.
y_cap = quantile(filtered[[targetVar]], probs = outliersQuantile)
p = ggplot(data = filtered,
           aes_string(x = 'Type', y = targetVar,
                      color = 'Language', fill = 'Language')) +
  geom_boxplot(outlier.size = 0.1, size = 0.1, outlier.shape = outliersShape) +
  scale_y_continuous(limits = c(0, y_cap),
                     labels = scales::comma_format(accuracy = 1, suffix = targetUnits)) +
  facet_wrap(~Setup, ncol = 3) +
  ggplot2::coord_flip()

ggplot_colors(p, axis.text.x = element_text(size = 8),
              panel.spacing.x = unit(10, 'points'),
              legend.position = 'bottom')
## Warning: Removed 740 rows containing non-finite values (stat_boxplot).

Comparing Queries

#functions
# Keep only the operations for which the PySpark-vs-Scala difference in
# the target variable is significant (two-sided Wilcoxon rank-sum test,
# p < 0.025). Returns the matching rows, or NULL (after a message) when
# no operation passes.
plot_get_significant = function(filtered){
  language_test = function(oper){
    fml = as.formula(paste0(targetVar,' ~ Language '))
    wilcox.test(fml, alternative = 'two.sided', filtered,
                subset = Operation == oper)$p.value
  }
  ops = unique(filtered$Operation)
  p_values = sapply(ops, language_test)
  significant_ops = ops[which(p_values < 0.025)]
  significant = filtered[filtered$Operation %in% significant_ops, ]
  if(nrow(significant) == 0){
    message("No Significant Elements")
    return(NULL)
  }
  significant
}
# Jittered scatter of the target variable vs dataset size with a median
# trend line per language, one facet per operation. A NULL input (no
# significant operations) yields an empty plot so callers can always plot
# the result.
plot_operations_get = function(filtered,title='',subtitle='',ncol = plotCols){
  if(is.null(filtered)){
    return(ggplot(data=filtered)+geom_blank())
  }
  plotData = mutate(filtered, Operation = factor(Operation))
  strip_width = round(100/ncol)  # wrap facet titles to the column width
  p = ggplot(data = plotData,
             aes_string(x = 'Dataset', y = targetVar, color = 'Language')) +
    ggplot2::stat_summary(aes(group=Language), fun.y=median, geom='line', size=.5) +
    geom_jitter(alpha=0.5, size=.5, width=0.1) +
    scale_y_continuous(labels=scales::comma_format(accuracy=1,suffix=targetUnits)) +
    facet_wrap(~ Operation, scales='free', ncol=ncol,
               labeller = label_wrap_gen(width = strip_width)) +
    ggplot2::ggtitle(title, subtitle=subtitle)

  ggplot_colors(p,
                legend.position = 'bottom',
                axis.text.x = element_text(size=8,angle=45,hjust=1),
                axis.text.y = element_text(size=8),
                strip.background = element_rect(fill='#636363'))
}
# For each query sub-type, emit (1) a chart of all operations and (2) a
# chart of only the operations with a statistically significant language
# difference (per plot_get_significant).
# FIX: the `autoSize` default was `~(.Platform$GUI == "RStudio")` — a
# one-sided formula, clearly a typo for the negation `!`. The argument is
# currently unused by the body, so callers are unaffected.
plot_operations = function(filtered,title,ncol=plotCols,autoSize = !(.Platform$GUI == "RStudio")){
  # Emit one sub-chunk per chart so each gets a height proportional to its
  # number of facet rows.
  emit = function(data, subtitle_label, st){
    p = plot_operations_get(data, title = paste0(title,' - ' , st),
                            subtitle = subtitle_label)
    subchunkify(p,
                fig_height = ceiling(n_distinct(data$Operation)/plotCols)*3+.5,
                fig_width = plotCols*3)
  }

  for (st in sort(unique(filtered$subType))){
    sub_data = filter(filtered, subType == st)
    emit(sub_data, ' (All Operations) ', st)

    significant = plot_get_significant(sub_data)
    if(!is.null(significant)){
      emit(significant, ' (Significant Operations) ', st)
    }
  }
}

getTable = function(data){
    # Mean of the target variable per sub-type / operation / dataset size,
    # reshaped so PySpark and Scala sit side by side with their difference
    # (delta = PySpark - Scala), rendered as a markdown table.
    # NOTE(review): relies on dplyr 0.8-era NSE semantics —
    # `summarise_at(.vars = <string>, .funs = vars(mean))` and
    # `spread(value = targetVar)` resolving a string variable — verify
    # before upgrading dplyr/tidyr (superseded by across()/pivot_wider()).
    # NOTE(review): the hard-coded "(sec)" column headers are wrong when
    # targetVar is Throughput (MB/s) — TODO confirm intended usage.
    t=data %>% group_by(subType,Operation,Dataset,Language) %>%
      summarise_at(.vars=targetVar,.funs=vars(mean)) %>%
      ungroup() %>%
      spread(key=Language,value=targetVar) %>%
      mutate(delta = PySpark-Scala)
    knitr::kable(t,digits=2,col.names=c('Category','Operation','Dataset','PySpark (sec)','Scala (sec)','PySpark - Scala'),padding=0,format='markdown')
}

Comparing Row operations

Rows Operations - All Environments:

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Filtering Filter 10 19.27 22.92 -3.64
Filtering Filter 100 50.36 51.01 -0.65
Filtering Filter 200 52.92 62.15 -9.24
Filtering Filter 300 48.25 65.61 -17.36
Filtering Filter 500 57.79 62.15 -4.36
Filtering Filter Reg Ex 1 10 21.49 25.13 -3.63
Filtering Filter Reg Ex 1 100 68.48 67.66 0.82
Filtering Filter Reg Ex 1 200 78.96 74.62 4.34
Filtering Filter Reg Ex 1 300 68.20 82.86 -14.66
Filtering Filter Reg Ex 1 500 83.87 75.10 8.77
Filtering Filter Reg Ex 2 10 21.97 25.68 -3.71
Filtering Filter Reg Ex 2 100 69.72 66.23 3.49
Filtering Filter Reg Ex 2 200 77.99 79.03 -1.04
Filtering Filter Reg Ex 2 300 70.58 81.03 -10.45
Filtering Filter Reg Ex 2 500 81.77 77.49 4.28
Run/Shift Running Sum 10 9.13 26.26 -17.12
Run/Shift Running Sum 100 16.30 92.15 -75.85
Run/Shift Running Sum 200 14.43 116.36 -101.93
Run/Shift Running Sum 300 12.02 117.70 -105.68
Run/Shift Running Sum 500 10.30 129.19 -118.89
Run/Shift Shift (Lag) 10 9.28 26.43 -17.15
Run/Shift Shift (Lag) 100 16.10 101.13 -85.03
Run/Shift Shift (Lag) 200 13.32 129.59 -116.27
Run/Shift Shift (Lag) 300 11.97 127.05 -115.08
Run/Shift Shift (Lag) 500 11.00 130.30 -119.30
Writing Writing 100 new rows 10 12.27 18.44 -6.17
Writing Writing 100 new rows 100 24.24 43.73 -19.48
Writing Writing 100 new rows 200 21.47 38.30 -16.83
Writing Writing 100 new rows 300 18.38 32.15 -13.77
Writing Writing 100 new rows 500 17.80 27.52 -9.72
Writing Writing 1000 new rows 10 13.14 18.02 -4.88
Writing Writing 1000 new rows 100 24.42 41.41 -16.99
Writing Writing 1000 new rows 200 21.48 37.49 -16.01
Writing Writing 1000 new rows 300 17.25 30.80 -13.55
Writing Writing 1000 new rows 500 16.75 28.79 -12.04
Writing Writing 10000 new rows 10 8.78 13.52 -4.74
Writing Writing 10000 new rows 100 21.81 38.44 -16.63
Writing Writing 10000 new rows 200 18.05 33.03 -14.98
Writing Writing 10000 new rows 300 15.96 32.82 -16.86
Writing Writing 10000 new rows 500 15.22 28.58 -13.36

## No Significant Elements

Rows Operations - Setup: Databricks

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Filtering Filter 10 11.51 17.63 -6.12
Filtering Filter 100 53.18 49.69 3.49
Filtering Filter 200 55.69 68.63 -12.93
Filtering Filter 300 54.67 76.06 -21.39
Filtering Filter 500 67.35 63.36 3.99
Filtering Filter Reg Ex 1 10 12.19 19.05 -6.85
Filtering Filter Reg Ex 1 100 68.26 61.08 7.18
Filtering Filter Reg Ex 1 200 85.08 79.32 5.75
Filtering Filter Reg Ex 1 300 79.11 93.57 -14.46
Filtering Filter Reg Ex 1 500 101.99 80.04 21.95
Filtering Filter Reg Ex 2 10 13.48 18.23 -4.75
Filtering Filter Reg Ex 2 100 68.72 64.62 4.10
Filtering Filter Reg Ex 2 200 81.83 78.87 2.95
Filtering Filter Reg Ex 2 300 83.05 88.46 -5.42
Filtering Filter Reg Ex 2 500 96.51 80.21 16.30
Run/Shift Running Sum 10 5.35 18.67 -13.32
Run/Shift Running Sum 100 15.83 95.00 -79.17
Run/Shift Running Sum 200 14.21 118.52 -104.31
Run/Shift Running Sum 300 11.51 118.02 -106.51
Run/Shift Running Sum 500 9.70 98.83 -89.14
Run/Shift Shift (Lag) 10 5.53 20.33 -14.80
Run/Shift Shift (Lag) 100 15.30 92.75 -77.45
Run/Shift Shift (Lag) 200 11.57 119.28 -107.71
Run/Shift Shift (Lag) 300 10.62 125.22 -114.60
Run/Shift Shift (Lag) 500 10.38 110.26 -99.87
Writing Writing 100 new rows 10 7.93 13.09 -5.16
Writing Writing 100 new rows 100 24.88 45.05 -20.17
Writing Writing 100 new rows 200 22.27 30.82 -8.55
Writing Writing 100 new rows 300 17.20 23.80 -6.59
Writing Writing 100 new rows 500 15.87 23.61 -7.74
Writing Writing 1000 new rows 10 8.60 13.48 -4.88
Writing Writing 1000 new rows 100 24.80 40.75 -15.95
Writing Writing 1000 new rows 200 21.13 29.89 -8.77
Writing Writing 1000 new rows 300 16.93 25.35 -8.42
Writing Writing 1000 new rows 500 15.15 23.68 -8.53
Writing Writing 10000 new rows 10 6.11 10.67 -4.56
Writing Writing 10000 new rows 100 23.35 39.76 -16.40
Writing Writing 10000 new rows 200 16.02 22.83 -6.81
Writing Writing 10000 new rows 300 14.16 20.47 -6.31
Writing Writing 10000 new rows 500 13.54 19.58 -6.04

## No Significant Elements

Rows Operations - Setup: Local VM

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Filtering Filter 10 21.02 23.17 -2.15
Filtering Filter 100 30.70 32.28 -1.58
Filtering Filter 200 32.08 33.25 -1.17
Filtering Filter 300 20.36 30.21 -9.85
Filtering Filter 500 26.49 30.75 -4.26
Filtering Filter Reg Ex 1 10 22.81 25.65 -2.84
Filtering Filter Reg Ex 1 100 41.95 49.37 -7.42
Filtering Filter Reg Ex 1 200 42.05 36.45 5.60
Filtering Filter Reg Ex 1 300 20.97 38.87 -17.90
Filtering Filter Reg Ex 1 500 31.33 31.87 -0.53
Filtering Filter Reg Ex 2 10 23.26 27.20 -3.94
Filtering Filter Reg Ex 2 100 41.97 37.79 4.18
Filtering Filter Reg Ex 2 200 43.21 48.02 -4.81
Filtering Filter Reg Ex 2 300 22.49 39.11 -16.62
Filtering Filter Reg Ex 2 500 31.84 38.54 -6.70
Run/Shift Running Sum 10 10.76 28.61 -17.86
Run/Shift Running Sum 100 13.60 59.99 -46.39
Run/Shift Running Sum 200 10.06 75.79 -65.73
Run/Shift Running Sum 300 6.75 72.37 -65.61
Run/Shift Running Sum 500 5.75 110.91 -105.16
Run/Shift Shift (Lag) 10 10.64 26.48 -15.83
Run/Shift Shift (Lag) 100 13.43 75.41 -61.98
Run/Shift Shift (Lag) 200 9.78 100.40 -90.62
Run/Shift Shift (Lag) 300 7.52 87.25 -79.74
Run/Shift Shift (Lag) 500 5.69 102.97 -97.28
Writing Writing 100 new rows 10 12.40 19.56 -7.16
Writing Writing 100 new rows 100 16.86 24.83 -7.97
Writing Writing 100 new rows 200 13.37 30.54 -17.17
Writing Writing 100 new rows 300 10.65 21.70 -11.05
Writing Writing 100 new rows 500 11.25 16.19 -4.94
Writing Writing 1000 new rows 10 13.96 18.04 -4.08
Writing Writing 1000 new rows 100 17.62 25.95 -8.33
Writing Writing 1000 new rows 200 15.71 28.17 -12.47
Writing Writing 1000 new rows 300 9.01 19.73 -10.73
Writing Writing 1000 new rows 500 10.05 18.24 -8.18
Writing Writing 10000 new rows 10 8.35 12.35 -4.00
Writing Writing 10000 new rows 100 15.16 21.15 -5.99
Writing Writing 10000 new rows 200 12.81 27.22 -14.41
Writing Writing 10000 new rows 300 9.21 28.88 -19.66
Writing Writing 10000 new rows 500 9.45 20.65 -11.20

## No Significant Elements

Rows Operations - Setup: Cluster

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Filtering Filter 10 31.31 32.99 -1.67
Filtering Filter 100 84.01 91.11 -7.09
Filtering Filter 200 89.04 107.01 -17.97
Filtering Filter 300 92.50 115.49 -23.00
Filtering Filter 500 101.28 122.53 -21.25
Filtering Filter Reg Ex 1 10 37.45 36.23 1.22
Filtering Filter Reg Ex 1 100 121.98 117.41 4.57
Filtering Filter Reg Ex 1 200 140.55 141.56 -1.00
Filtering Filter Reg Ex 1 300 143.03 149.45 -6.42
Filtering Filter Reg Ex 1 500 152.68 151.66 1.02
Filtering Filter Reg Ex 2 10 36.36 37.53 -1.17
Filtering Filter Reg Ex 2 100 127.24 126.33 0.91
Filtering Filter Reg Ex 2 200 139.89 141.37 -1.48
Filtering Filter Reg Ex 2 300 144.30 149.99 -5.69
Filtering Filter Reg Ex 2 500 152.17 149.97 2.20
Run/Shift Running Sum 10 13.46 36.73 -23.27
Run/Shift Running Sum 100 22.65 150.75 -128.10
Run/Shift Running Sum 200 23.63 193.21 -169.58
Run/Shift Running Sum 300 23.58 207.74 -184.16
Run/Shift Running Sum 500 20.62 226.48 -205.87
Run/Shift Shift (Lag) 10 14.06 38.55 -24.49
Run/Shift Shift (Lag) 100 23.03 169.31 -146.29
Run/Shift Shift (Lag) 200 23.91 208.61 -184.70
Run/Shift Shift (Lag) 300 23.58 210.29 -186.72
Run/Shift Shift (Lag) 500 22.87 225.04 -202.17
Writing Writing 100 new rows 10 20.71 26.92 -6.21
Writing Writing 100 new rows 100 37.73 78.87 -41.14
Writing Writing 100 new rows 200 36.07 68.78 -32.71
Writing Writing 100 new rows 300 36.18 69.78 -33.60
Writing Writing 100 new rows 500 34.77 58.01 -23.24
Writing Writing 1000 new rows 10 20.60 27.09 -6.49
Writing Writing 1000 new rows 100 37.26 73.64 -36.38
Writing Writing 1000 new rows 200 33.73 71.30 -37.57
Writing Writing 1000 new rows 300 34.31 63.84 -29.53
Writing Writing 1000 new rows 500 33.33 60.10 -26.77
Writing Writing 10000 new rows 10 14.98 21.53 -6.56
Writing Writing 10000 new rows 100 32.02 70.39 -38.37
Writing Writing 10000 new rows 200 32.61 65.08 -32.47
Writing Writing 10000 new rows 300 33.06 65.40 -32.34
Writing Writing 10000 new rows 500 30.13 62.45 -32.32

## Warning in wilcox.test.default(x = c(37.3931337469713, 34.3061034620522, :
## cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = c(14.4125942386937, 13.3451278278002, :
## cannot compute exact p-value with ties

Comparing Columns operations

Columns Operations - All Environments:

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Joining Full Outer Join 10 Columns 10 2.06 2.67 -0.61
Joining Full Outer Join 10 Columns 100 8.26 11.53 -3.28
Joining Full Outer Join 5 Columns 10 2.33 3.13 -0.81
Joining Full Outer Join 5 Columns 100 4.20 6.31 -2.11
Joining Inner Join 10 Columns 10 6.00 7.18 -1.19
Joining Inner Join 10 Columns 100 8.81 11.95 -3.13
Joining Inner Join 5 Columns 10 5.57 6.25 -0.69
Joining Inner Join 5 Columns 100 7.14 9.45 -2.31
Joining Left Outer Join 10 Columns 10 5.64 6.59 -0.95
Joining Left Outer Join 10 Columns 100 8.17 11.73 -3.56
Joining Left Outer Join 5 Columns 10 5.33 6.69 -1.36
Joining Left Outer Join 5 Columns 100 4.25 5.65 -1.40
Mathematics Mathematical Operation on Columns 10 29.64 29.35 0.29
Mathematics Mathematical Operation on Columns 100 115.50 107.61 7.89
Mathematics Mathematical Operation on Columns 200 134.69 122.18 12.52
Mathematics Mathematical Operation on Columns 300 120.30 142.56 -22.26
Mathematics Mathematical Operation on Columns 500 121.49 126.37 -4.87
Merging Merge 10 columns into 1 10 10.72 15.12 -4.40
Merging Merge 10 columns into 1 100 20.63 33.84 -13.22
Merging Merge 10 columns into 1 200 16.40 29.93 -13.52
Merging Merge 10 columns into 1 300 14.21 27.80 -13.59
Merging Merge 10 columns into 1 500 12.07 23.88 -11.81
Merging Merge 2 columns into 1 10 11.34 15.93 -4.60
Merging Merge 2 columns into 1 100 21.95 36.24 -14.28
Merging Merge 2 columns into 1 200 19.30 31.15 -11.85
Merging Merge 2 columns into 1 300 16.84 29.77 -12.93
Merging Merge 2 columns into 1 500 15.18 26.39 -11.20
Merging Merge 5 columns into 1 10 10.72 16.14 -5.42
Merging Merge 5 columns into 1 100 20.75 34.68 -13.93
Merging Merge 5 columns into 1 200 17.85 30.85 -13.00
Merging Merge 5 columns into 1 300 15.48 28.68 -13.20
Merging Merge 5 columns into 1 500 14.01 25.71 -11.70
Sorting Sorting Asc 1 column 10 6.30 8.16 -1.87
Sorting Sorting Asc 1 column 100 15.49 20.85 -5.36
Sorting Sorting Asc 1 column 200 8.15 10.40 -2.25
Sorting Sorting Asc 1 column 300 6.75 8.34 -1.59
Sorting Sorting Asc 1 column 500 5.96 8.14 -2.18
Sorting Sorting Asc 10 column 10 5.25 6.51 -1.26
Sorting Sorting Asc 10 column 100 14.67 19.13 -4.45
Sorting Sorting Asc 10 column 200 7.49 10.24 -2.75
Sorting Sorting Asc 10 column 300 6.43 8.27 -1.84
Sorting Sorting Asc 10 column 500 6.17 8.67 -2.51
Sorting Sorting Asc 5 column 10 6.20 7.68 -1.49
Sorting Sorting Asc 5 column 100 15.32 20.34 -5.02
Sorting Sorting Asc 5 column 200 8.72 10.54 -1.82
Sorting Sorting Asc 5 column 300 6.12 7.64 -1.52
Sorting Sorting Asc 5 column 500 6.39 7.98 -1.59
Sorting Sorting Desc 1 column 10 6.41 8.37 -1.96
Sorting Sorting Desc 1 column 100 15.47 20.09 -4.62
Sorting Sorting Desc 1 column 200 7.46 11.38 -3.92
Sorting Sorting Desc 1 column 300 6.64 9.04 -2.40
Sorting Sorting Desc 1 column 500 5.35 8.81 -3.46
Sorting Sorting Desc 10 column 10 5.61 7.31 -1.70
Sorting Sorting Desc 10 column 100 14.74 19.17 -4.43
Sorting Sorting Desc 10 column 200 7.56 9.35 -1.79
Sorting Sorting Desc 10 column 300 6.24 9.37 -3.13
Sorting Sorting Desc 10 column 500 6.28 8.16 -1.88
Sorting Sorting Desc 5 column 10 6.08 7.37 -1.29
Sorting Sorting Desc 5 column 100 15.06 19.34 -4.27
Sorting Sorting Desc 5 column 200 8.07 9.84 -1.76
Sorting Sorting Desc 5 column 300 6.38 9.36 -2.98
Sorting Sorting Desc 5 column 500 6.22 9.35 -3.13
Splitting Split 1 Column into 10 10 6.52 9.39 -2.87
Splitting Split 1 Column into 10 100 14.69 25.61 -10.93
Splitting Split 1 Column into 10 200 13.67 23.19 -9.52
Splitting Split 1 Column into 10 300 11.62 22.26 -10.64
Splitting Split 1 Column into 10 500 11.29 19.74 -8.45
Splitting Split 1 Column into 5 10 7.88 11.25 -3.36
Splitting Split 1 Column into 5 100 16.73 27.86 -11.13
Splitting Split 1 Column into 5 200 14.81 28.49 -13.68
Splitting Split 1 Column into 5 300 13.86 26.04 -12.17
Splitting Split 1 Column into 5 500 13.22 23.25 -10.03

## No Significant Elements

Columns Operations - Setup:Databricks

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Joining Full Outer Join 10 Columns 10 1.60 2.12 -0.51
Joining Full Outer Join 10 Columns 100 7.32 8.10 -0.78
Joining Full Outer Join 5 Columns 10 1.64 2.63 -0.99
Joining Full Outer Join 5 Columns 100 2.17 3.95 -1.78
Joining Inner Join 10 Columns 10 3.42 5.61 -2.19
Joining Inner Join 10 Columns 100 7.93 9.15 -1.22
Joining Inner Join 5 Columns 10 3.55 5.13 -1.58
Joining Inner Join 5 Columns 100 6.36 7.32 -0.96
Joining Left Outer Join 10 Columns 10 3.55 5.24 -1.69
Joining Left Outer Join 10 Columns 100 7.24 8.34 -1.10
Joining Left Outer Join 5 Columns 10 3.08 5.38 -2.29
Joining Left Outer Join 5 Columns 100 2.08 2.96 -0.88
Mathematics Mathematical Operation on Columns 10 19.70 22.65 -2.95
Mathematics Mathematical Operation on Columns 100 114.39 103.84 10.55
Mathematics Mathematical Operation on Columns 200 148.64 114.14 34.50
Mathematics Mathematical Operation on Columns 300 131.89 139.29 -7.40
Mathematics Mathematical Operation on Columns 500 124.85 111.49 13.36
Merging Merge 10 columns into 1 10 7.52 11.82 -4.30
Merging Merge 10 columns into 1 100 22.09 34.39 -12.30
Merging Merge 10 columns into 1 200 15.40 24.47 -9.07
Merging Merge 10 columns into 1 300 13.05 21.64 -8.60
Merging Merge 10 columns into 1 500 13.72 16.96 -3.24
Merging Merge 2 columns into 1 10 7.18 13.35 -6.17
Merging Merge 2 columns into 1 100 22.57 36.28 -13.71
Merging Merge 2 columns into 1 200 18.55 23.31 -4.76
Merging Merge 2 columns into 1 300 15.42 23.72 -8.30
Merging Merge 2 columns into 1 500 14.97 21.23 -6.27
Merging Merge 5 columns into 1 10 7.38 12.92 -5.54
Merging Merge 5 columns into 1 100 21.46 35.06 -13.60
Merging Merge 5 columns into 1 200 17.51 23.94 -6.43
Merging Merge 5 columns into 1 300 14.34 22.68 -8.34
Merging Merge 5 columns into 1 500 12.93 19.60 -6.67
Sorting Sorting Asc 1 column 10 4.03 6.27 -2.24
Sorting Sorting Asc 1 column 100 15.42 18.59 -3.17
Sorting Sorting Asc 1 column 200 5.99 4.71 1.27
Sorting Sorting Asc 1 column 300 4.31 2.83 1.48
Sorting Sorting Asc 1 column 500 3.86 3.62 0.24
Sorting Sorting Asc 10 column 10 3.61 5.32 -1.71
Sorting Sorting Asc 10 column 100 14.24 18.57 -4.33
Sorting Sorting Asc 10 column 200 4.47 5.49 -1.02
Sorting Sorting Asc 10 column 300 3.83 2.80 1.04
Sorting Sorting Asc 10 column 500 3.71 3.44 0.26
Sorting Sorting Asc 5 column 10 4.16 5.65 -1.48
Sorting Sorting Asc 5 column 100 15.49 19.07 -3.58
Sorting Sorting Asc 5 column 200 5.20 5.52 -0.32
Sorting Sorting Asc 5 column 300 3.77 3.12 0.65
Sorting Sorting Asc 5 column 500 3.76 3.87 -0.11
Sorting Sorting Desc 1 column 10 4.18 5.63 -1.45
Sorting Sorting Desc 1 column 100 16.39 18.67 -2.29
Sorting Sorting Desc 1 column 200 4.62 6.12 -1.51
Sorting Sorting Desc 1 column 300 3.30 3.22 0.08
Sorting Sorting Desc 1 column 500 3.59 4.06 -0.47
Sorting Sorting Desc 10 column 10 3.59 5.53 -1.94
Sorting Sorting Desc 10 column 100 14.31 17.78 -3.47
Sorting Sorting Desc 10 column 200 4.44 3.95 0.49
Sorting Sorting Desc 10 column 300 3.60 3.65 -0.05
Sorting Sorting Desc 10 column 500 3.78 3.85 -0.07
Sorting Sorting Desc 5 column 10 3.80 5.57 -1.77
Sorting Sorting Desc 5 column 100 15.06 18.25 -3.19
Sorting Sorting Desc 5 column 200 4.37 3.50 0.86
Sorting Sorting Desc 5 column 300 3.66 3.04 0.62
Sorting Sorting Desc 5 column 500 3.36 3.78 -0.41
Splitting Split 1 Column into 10 10 5.61 9.22 -3.61
Splitting Split 1 Column into 10 100 18.40 28.60 -10.20
Splitting Split 1 Column into 10 200 16.25 21.32 -5.07
Splitting Split 1 Column into 10 300 12.15 19.84 -7.69
Splitting Split 1 Column into 10 500 11.74 16.25 -4.51
Splitting Split 1 Column into 5 10 6.45 10.41 -3.96
Splitting Split 1 Column into 5 100 20.23 30.66 -10.43
Splitting Split 1 Column into 5 200 15.96 24.03 -8.07
Splitting Split 1 Column into 5 300 13.94 21.04 -7.10
Splitting Split 1 Column into 5 500 12.98 19.26 -6.28

## Warning in wilcox.test.default(x = c(2.58357031716984, 1.37888996360214, :
## cannot compute exact p-value with ties

## No Significant Elements

## No Significant Elements

Columns Operations - Setup:Local VM

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Joining Full Outer Join 10 Columns 10 1.19 1.41 -0.22
Joining Full Outer Join 10 Columns 100 5.11 5.89 -0.77
Joining Full Outer Join 5 Columns 10 1.32 1.80 -0.49
Joining Full Outer Join 5 Columns 100 4.27 5.15 -0.88
Joining Inner Join 10 Columns 10 6.93 6.19 0.74
Joining Inner Join 10 Columns 100 5.37 7.27 -1.90
Joining Inner Join 5 Columns 10 5.73 5.55 0.18
Joining Inner Join 5 Columns 100 4.83 4.76 0.07
Joining Left Outer Join 10 Columns 10 5.96 5.79 0.17
Joining Left Outer Join 10 Columns 100 5.16 5.95 -0.79
Joining Left Outer Join 5 Columns 10 5.56 5.51 0.05
Joining Left Outer Join 5 Columns 100 4.41 5.54 -1.13
Mathematics Mathematical Operation on Columns 10 33.32 31.63 1.69
Mathematics Mathematical Operation on Columns 100 83.60 79.70 3.90
Mathematics Mathematical Operation on Columns 200 86.13 84.61 1.52
Mathematics Mathematical Operation on Columns 300 51.49 107.36 -55.87
Mathematics Mathematical Operation on Columns 500 56.93 87.55 -30.62
Merging Merge 10 columns into 1 10 10.53 14.90 -4.36
Merging Merge 10 columns into 1 100 13.99 21.19 -7.20
Merging Merge 10 columns into 1 200 11.17 22.06 -10.89
Merging Merge 10 columns into 1 300 7.75 22.86 -15.11
Merging Merge 10 columns into 1 500 7.31 19.63 -12.32
Merging Merge 2 columns into 1 10 11.63 14.55 -2.92
Merging Merge 2 columns into 1 100 14.94 23.00 -8.06
Merging Merge 2 columns into 1 200 13.07 23.60 -10.53
Merging Merge 2 columns into 1 300 9.88 20.82 -10.94
Merging Merge 2 columns into 1 500 8.22 17.24 -9.02
Merging Merge 5 columns into 1 10 10.32 15.24 -4.91
Merging Merge 5 columns into 1 100 14.25 22.33 -8.07
Merging Merge 5 columns into 1 200 11.74 23.80 -12.05
Merging Merge 5 columns into 1 300 9.70 20.63 -10.93
Merging Merge 5 columns into 1 500 8.82 18.44 -9.61
Sorting Sorting Asc 1 column 10 5.72 6.99 -1.27
Sorting Sorting Asc 1 column 100 9.68 12.80 -3.13
Sorting Sorting Asc 1 column 200 7.94 12.76 -4.81
Sorting Sorting Asc 1 column 300 5.82 10.06 -4.24
Sorting Sorting Asc 1 column 500 6.22 8.70 -2.48
Sorting Sorting Asc 10 column 10 4.21 5.09 -0.88
Sorting Sorting Asc 10 column 100 9.03 10.58 -1.55
Sorting Sorting Asc 10 column 200 7.99 11.98 -3.99
Sorting Sorting Asc 10 column 300 5.72 9.60 -3.88
Sorting Sorting Asc 10 column 500 7.01 10.14 -3.13
Sorting Sorting Asc 5 column 10 5.07 6.62 -1.55
Sorting Sorting Asc 5 column 100 9.56 11.11 -1.55
Sorting Sorting Asc 5 column 200 10.13 13.13 -3.00
Sorting Sorting Asc 5 column 300 4.54 8.60 -4.05
Sorting Sorting Asc 5 column 500 6.94 8.08 -1.13
Sorting Sorting Desc 1 column 10 5.57 8.06 -2.49
Sorting Sorting Desc 1 column 100 8.82 9.92 -1.10
Sorting Sorting Desc 1 column 200 7.61 13.82 -6.20
Sorting Sorting Desc 1 column 300 6.31 11.88 -5.57
Sorting Sorting Desc 1 column 500 5.32 9.82 -4.50
Sorting Sorting Desc 10 column 10 4.45 6.25 -1.80
Sorting Sorting Desc 10 column 100 9.24 9.78 -0.54
Sorting Sorting Desc 10 column 200 8.21 11.56 -3.36
Sorting Sorting Desc 10 column 300 5.22 11.86 -6.65
Sorting Sorting Desc 10 column 500 6.79 8.61 -1.82
Sorting Sorting Desc 5 column 10 5.17 6.65 -1.48
Sorting Sorting Desc 5 column 100 9.85 9.87 -0.02
Sorting Sorting Desc 5 column 200 9.52 12.98 -3.46
Sorting Sorting Desc 5 column 300 5.46 12.27 -6.81
Sorting Sorting Desc 5 column 500 7.17 11.57 -4.40
Splitting Split 1 Column into 10 10 4.07 5.97 -1.90
Splitting Split 1 Column into 10 100 4.44 10.26 -5.82
Splitting Split 1 Column into 10 200 4.32 12.36 -8.03
Splitting Split 1 Column into 10 300 3.19 11.91 -8.72
Splitting Split 1 Column into 10 500 3.13 9.48 -6.35
Splitting Split 1 Column into 5 10 5.51 8.40 -2.89
Splitting Split 1 Column into 5 100 6.48 12.71 -6.23
Splitting Split 1 Column into 5 200 6.03 21.34 -15.31
Splitting Split 1 Column into 5 300 4.94 17.74 -12.80
Splitting Split 1 Column into 5 500 4.97 14.80 -9.82

## No Significant Elements

## No Significant Elements

Columns Operations - Setup:Cluster

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Joining Full Outer Join 10 Columns 10 4.70 5.02 -0.32
Joining Full Outer Join 10 Columns 100 16.41 24.04 -7.63
Joining Full Outer Join 5 Columns 10 5.73 5.47 0.26
Joining Full Outer Join 5 Columns 100 8.12 12.20 -4.08
Joining Inner Join 10 Columns 10 9.28 11.31 -2.04
Joining Inner Join 10 Columns 100 17.47 22.21 -4.74
Joining Inner Join 5 Columns 10 9.27 9.20 0.07
Joining Inner Join 5 Columns 100 13.30 18.40 -5.10
Joining Left Outer Join 10 Columns 10 9.17 10.07 -0.90
Joining Left Outer Join 10 Columns 100 16.08 24.28 -8.20
Joining Left Outer Join 5 Columns 10 9.35 10.49 -1.14
Joining Left Outer Join 5 Columns 100 8.28 11.14 -2.87
Mathematics Mathematical Operation on Columns 10 42.17 38.19 3.98
Mathematics Mathematical Operation on Columns 100 181.53 170.97 10.56
Mathematics Mathematical Operation on Columns 200 203.93 213.38 -9.45
Mathematics Mathematical Operation on Columns 300 237.07 219.48 17.59
Mathematics Mathematical Operation on Columns 500 243.91 233.76 10.15
Merging Merge 10 columns into 1 10 17.48 22.18 -4.70
Merging Merge 10 columns into 1 100 30.96 58.05 -27.09
Merging Merge 10 columns into 1 200 28.88 56.57 -27.69
Merging Merge 10 columns into 1 300 29.21 50.00 -20.79
Merging Merge 10 columns into 1 500 18.27 46.19 -27.92
Merging Merge 2 columns into 1 10 19.07 23.86 -4.79
Merging Merge 2 columns into 1 100 34.76 62.62 -27.86
Merging Merge 2 columns into 1 200 33.25 61.95 -28.69
Merging Merge 2 columns into 1 300 33.62 59.77 -26.15
Merging Merge 2 columns into 1 500 29.55 55.00 -25.45
Merging Merge 5 columns into 1 10 18.17 24.38 -6.21
Merging Merge 5 columns into 1 100 32.32 58.62 -26.30
Merging Merge 5 columns into 1 200 30.74 58.76 -28.02
Merging Merge 5 columns into 1 300 29.32 56.77 -27.45
Merging Merge 5 columns into 1 500 26.54 52.47 -25.93
Sorting Sorting Asc 1 column 10 11.99 14.31 -2.32
Sorting Sorting Asc 1 column 100 27.27 41.46 -14.19
Sorting Sorting Asc 1 column 200 12.90 17.06 -4.16
Sorting Sorting Asc 1 column 300 13.47 15.93 -2.46
Sorting Sorting Asc 1 column 500 9.66 16.06 -6.41
Sorting Sorting Asc 10 column 10 10.60 11.73 -1.14
Sorting Sorting Asc 10 column 100 26.82 37.33 -10.51
Sorting Sorting Asc 10 column 200 12.53 16.26 -3.73
Sorting Sorting Asc 10 column 300 13.04 16.53 -3.50
Sorting Sorting Asc 10 column 500 9.40 16.20 -6.80
Sorting Sorting Asc 5 column 10 12.52 13.89 -1.38
Sorting Sorting Asc 5 column 100 26.48 41.35 -14.87
Sorting Sorting Asc 5 column 200 12.93 15.40 -2.48
Sorting Sorting Asc 5 column 300 13.49 14.76 -1.27
Sorting Sorting Asc 5 column 500 10.53 16.00 -5.47
Sorting Sorting Desc 1 column 10 12.53 14.45 -1.92
Sorting Sorting Desc 1 column 100 26.93 43.23 -16.30
Sorting Sorting Desc 1 column 200 12.85 17.05 -4.20
Sorting Sorting Desc 1 column 300 13.29 15.00 -1.71
Sorting Sorting Desc 1 column 500 8.92 16.30 -7.38
Sorting Sorting Desc 10 column 10 11.98 13.01 -1.03
Sorting Sorting Desc 10 column 100 26.63 40.75 -14.12
Sorting Sorting Desc 10 column 200 12.52 15.72 -3.20
Sorting Sorting Desc 10 column 300 13.05 15.84 -2.79
Sorting Sorting Desc 10 column 500 10.25 15.88 -5.63
Sorting Sorting Desc 5 column 10 12.46 12.42 0.05
Sorting Sorting Desc 5 column 100 25.50 40.45 -14.96
Sorting Sorting Desc 5 column 200 12.59 16.21 -3.62
Sorting Sorting Desc 5 column 300 13.13 16.18 -3.05
Sorting Sorting Desc 5 column 500 10.02 16.04 -6.02
Splitting Split 1 Column into 10 10 13.23 16.59 -3.36
Splitting Split 1 Column into 10 100 27.76 50.35 -22.58
Splitting Split 1 Column into 10 200 27.19 48.57 -21.38
Splitting Split 1 Column into 10 300 27.43 47.79 -20.36
Splitting Split 1 Column into 10 500 26.69 47.23 -20.54
Splitting Split 1 Column into 5 10 15.49 18.61 -3.12
Splitting Split 1 Column into 5 100 30.23 52.56 -22.34
Splitting Split 1 Column into 5 200 30.10 51.75 -21.65
Splitting Split 1 Column into 5 300 31.56 52.63 -21.07
Splitting Split 1 Column into 5 500 30.19 48.15 -17.96

## Warning in wilcox.test.default(x = c(43.9746927716948, 43.3457651552285, :
## cannot compute exact p-value with ties
## No Significant Elements

Comparing Aggregate operations

Scala is faster than PySpark

Aggregate Operations - All Environments:

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Grouping GroupBy 1 column 10 1.30 1.56 -0.27
Grouping GroupBy 1 column 100 7.97 8.65 -0.68
Grouping GroupBy 1 column 200 11.04 11.45 -0.40
Grouping GroupBy 1 column 300 11.30 13.10 -1.80
Grouping GroupBy 1 column 500 14.32 13.41 0.91
Grouping GroupBy 10 columns 10 0.87 1.08 -0.21
Grouping GroupBy 10 columns 100 3.40 4.63 -1.23
Grouping GroupBy 10 columns 200 4.17 6.40 -2.23
Grouping GroupBy 10 columns 300 4.15 6.85 -2.70
Grouping GroupBy 10 columns 500 3.06 3.84 -0.78
Grouping GroupBy 5 columns 10 0.92 1.10 -0.18
Grouping GroupBy 5 columns 100 4.51 5.43 -0.93
Grouping GroupBy 5 columns 200 6.14 7.72 -1.59
Grouping GroupBy 5 columns 300 6.67 8.83 -2.15
Grouping GroupBy 5 columns 500 8.81 9.42 -0.61
Ranking Ranking by Group 10 5.79 7.01 -1.22
Ranking Ranking by Group 100 16.66 24.20 -7.54
Ranking Ranking by Group 200 15.42 22.58 -7.16
Ranking Ranking by Group 300 13.88 21.77 -7.89
Ranking Ranking by Group 500 13.40 20.19 -6.79

Aggregate Operations - Setup: Databricks

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Grouping GroupBy 1 column 10 0.73 1.08 -0.35
Grouping GroupBy 1 column 100 8.12 7.39 0.73
Grouping GroupBy 1 column 200 12.03 11.49 0.54
Grouping GroupBy 1 column 300 12.30 13.54 -1.24
Grouping GroupBy 1 column 500 17.27 13.56 3.70
Grouping GroupBy 10 columns 10 0.58 0.85 -0.27
Grouping GroupBy 10 columns 100 3.55 4.07 -0.52
Grouping GroupBy 10 columns 200 4.52 5.95 -1.43
Grouping GroupBy 10 columns 300 4.58 6.66 -2.08
Grouping GroupBy 10 columns 500 3.16 2.83 0.32
Grouping GroupBy 5 columns 10 0.62 0.83 -0.21
Grouping GroupBy 5 columns 100 4.49 4.87 -0.38
Grouping GroupBy 5 columns 200 6.21 7.43 -1.22
Grouping GroupBy 5 columns 300 6.94 8.55 -1.62
Grouping GroupBy 5 columns 500 10.03 8.79 1.24
Ranking Ranking by Group 10 4.14 6.20 -2.06
Ranking Ranking by Group 100 17.25 23.41 -6.15
Ranking Ranking by Group 200 15.94 19.02 -3.08
Ranking Ranking by Group 300 12.91 18.37 -5.46
Ranking Ranking by Group 500 13.00 16.05 -3.05

## No Significant Elements

Aggregate Operations - Setup: Local VM

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Grouping GroupBy 1 column 10 0.79 1.34 -0.55
Grouping GroupBy 1 column 100 2.87 5.44 -2.57
Grouping GroupBy 1 column 200 3.45 5.89 -2.44
Grouping GroupBy 1 column 300 2.37 5.85 -3.48
Grouping GroupBy 1 column 500 3.48 5.39 -1.90
Grouping GroupBy 10 columns 10 0.52 0.77 -0.26
Grouping GroupBy 10 columns 100 1.46 2.65 -1.19
Grouping GroupBy 10 columns 200 1.66 3.43 -1.77
Grouping GroupBy 10 columns 300 1.24 2.94 -1.70
Grouping GroupBy 10 columns 500 1.40 2.73 -1.33
Grouping GroupBy 5 columns 10 0.57 0.76 -0.19
Grouping GroupBy 5 columns 100 1.93 3.06 -1.13
Grouping GroupBy 5 columns 200 2.35 3.97 -1.61
Grouping GroupBy 5 columns 300 1.83 4.06 -2.23
Grouping GroupBy 5 columns 500 2.38 3.81 -1.43
Ranking Ranking by Group 10 3.55 4.54 -0.98
Ranking Ranking by Group 100 10.16 14.68 -4.52
Ranking Ranking by Group 200 8.14 14.22 -6.07
Ranking Ranking by Group 300 7.21 12.12 -4.91
Ranking Ranking by Group 500 5.86 13.34 -7.48

Aggregate Operations - Setup: Cluster

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Grouping GroupBy 1 column 10 3.44 2.99 0.44
Grouping GroupBy 1 column 100 17.88 17.61 0.27
Grouping GroupBy 1 column 200 24.25 22.46 1.79
Grouping GroupBy 1 column 300 27.39 26.73 0.66
Grouping GroupBy 1 column 500 30.10 29.13 0.97
Grouping GroupBy 10 columns 10 2.15 2.14 0.01
Grouping GroupBy 10 columns 100 6.98 9.69 -2.70
Grouping GroupBy 10 columns 200 8.50 13.25 -4.75
Grouping GroupBy 10 columns 300 9.10 15.03 -5.93
Grouping GroupBy 10 columns 500 6.19 8.08 -1.88
Grouping GroupBy 5 columns 10 2.23 2.29 -0.07
Grouping GroupBy 5 columns 100 9.68 11.32 -1.64
Grouping GroupBy 5 columns 200 13.55 15.82 -2.27
Grouping GroupBy 5 columns 300 15.89 18.90 -3.01
Grouping GroupBy 5 columns 500 19.24 21.91 -2.67
Ranking Ranking by Group 10 13.56 13.58 -0.02
Ranking Ranking by Group 100 28.46 44.82 -16.35
Ranking Ranking by Group 200 28.93 46.44 -17.51
Ranking Ranking by Group 300 29.15 47.86 -18.71
Ranking Ranking by Group 500 29.29 42.18 -12.88

Comparing Mixed operations

Scala is faster than PySpark

Mixed Operations - All Environments:

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Pivots Pivot 1 Rows and 1 Column 10 4.42 4.87 -0.44
Pivots Pivot 1 Rows and 1 Column 100 33.09 31.88 1.21
Pivots Pivot 1 Rows and 1 Column 200 47.27 46.44 0.83
Pivots Pivot 1 Rows and 1 Column 300 47.36 49.58 -2.22
Pivots Pivot 1 Rows and 1 Column 500 59.54 55.97 3.57
Pivots Pivot 10 Rows and 1 Column 10 2.15 2.57 -0.42
Pivots Pivot 10 Rows and 1 Column 100 11.83 12.59 -0.76
Pivots Pivot 10 Rows and 1 Column 200 17.40 19.51 -2.10
Pivots Pivot 10 Rows and 1 Column 300 18.29 23.15 -4.86
Pivots Pivot 10 Rows and 1 Column 500 23.10 29.14 -6.04
Pivots Pivot 5 Rows and 1 Column 10 2.37 2.84 -0.47
Pivots Pivot 5 Rows and 1 Column 100 13.30 13.71 -0.40
Pivots Pivot 5 Rows and 1 Column 200 21.63 22.50 -0.87
Pivots Pivot 5 Rows and 1 Column 300 24.56 29.58 -5.02
Pivots Pivot 5 Rows and 1 Column 500 32.85 33.67 -0.82

## No Significant Elements

Mixed Operations - Setup: Databricks

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Pivots Pivot 1 Rows and 1 Column 10 2.57 3.06 -0.49
Pivots Pivot 1 Rows and 1 Column 100 29.73 24.38 5.36
Pivots Pivot 1 Rows and 1 Column 200 45.50 41.06 4.44
Pivots Pivot 1 Rows and 1 Column 300 42.56 46.00 -3.43
Pivots Pivot 1 Rows and 1 Column 500 52.25 48.85 3.40
Pivots Pivot 10 Rows and 1 Column 10 1.41 1.95 -0.54
Pivots Pivot 10 Rows and 1 Column 100 10.25 9.22 1.02
Pivots Pivot 10 Rows and 1 Column 200 15.68 13.90 1.78
Pivots Pivot 10 Rows and 1 Column 300 16.40 18.99 -2.60
Pivots Pivot 10 Rows and 1 Column 500 21.18 22.55 -1.38
Pivots Pivot 5 Rows and 1 Column 10 1.55 2.16 -0.60
Pivots Pivot 5 Rows and 1 Column 100 11.28 10.00 1.28
Pivots Pivot 5 Rows and 1 Column 200 17.79 16.44 1.35
Pivots Pivot 5 Rows and 1 Column 300 20.73 22.13 -1.41
Pivots Pivot 5 Rows and 1 Column 500 29.20 25.17 4.03

## No Significant Elements

Mixed Operations - Setup: Local VM

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Pivots Pivot 1 Rows and 1 Column 10 3.64 4.81 -1.17
Pivots Pivot 1 Rows and 1 Column 100 23.76 27.31 -3.55
Pivots Pivot 1 Rows and 1 Column 200 32.26 33.57 -1.31
Pivots Pivot 1 Rows and 1 Column 300 24.67 34.97 -10.31
Pivots Pivot 1 Rows and 1 Column 500 39.40 41.93 -2.53
Pivots Pivot 10 Rows and 1 Column 10 1.33 1.93 -0.60
Pivots Pivot 10 Rows and 1 Column 100 6.97 9.01 -2.04
Pivots Pivot 10 Rows and 1 Column 200 10.20 14.85 -4.64
Pivots Pivot 10 Rows and 1 Column 300 8.60 12.78 -4.18
Pivots Pivot 10 Rows and 1 Column 500 12.36 19.47 -7.11
Pivots Pivot 5 Rows and 1 Column 10 1.53 2.14 -0.61
Pivots Pivot 5 Rows and 1 Column 100 7.73 10.19 -2.47
Pivots Pivot 5 Rows and 1 Column 200 13.40 15.88 -2.48
Pivots Pivot 5 Rows and 1 Column 300 11.82 21.80 -9.98
Pivots Pivot 5 Rows and 1 Column 500 16.65 23.62 -6.97

Mixed Operations - Setup: Cluster

Category Operation Dataset PySpark (sec) Scala (sec) PySpark - Scala
Pivots Pivot 1 Rows and 1 Column 10 9.69 8.59 1.10
Pivots Pivot 1 Rows and 1 Column 100 58.46 56.04 2.42
Pivots Pivot 1 Rows and 1 Column 200 80.83 82.92 -2.10
Pivots Pivot 1 Rows and 1 Column 300 102.34 85.94 16.39
Pivots Pivot 1 Rows and 1 Column 500 114.39 98.28 16.11
Pivots Pivot 10 Rows and 1 Column 10 5.28 5.09 0.19
Pivots Pivot 10 Rows and 1 Column 100 24.73 26.48 -1.75
Pivots Pivot 10 Rows and 1 Column 200 35.26 40.03 -4.77
Pivots Pivot 10 Rows and 1 Column 300 41.43 52.20 -10.77
Pivots Pivot 10 Rows and 1 Column 500 48.41 61.63 -13.22
Pivots Pivot 5 Rows and 1 Column 10 5.68 5.61 0.06
Pivots Pivot 5 Rows and 1 Column 100 28.50 28.15 0.35
Pivots Pivot 5 Rows and 1 Column 200 45.74 47.83 -2.09
Pivots Pivot 5 Rows and 1 Column 300 56.95 60.01 -3.06
Pivots Pivot 5 Rows and 1 Column 500 72.58 70.78 1.79

## No Significant Elements

Comparing Runs operations

slight improvement with additional runs (not statistically significant)

# Order the RunID factor levels by the median of the target variable so the
# boxplots appear in increasing order.
# Fixes two defects in the original:
#   * the reordered factor was stored in `filtered$Operation` while the plot
#     maps x = RunID, so the reordering never affected the chart — reorder
#     the plotted variable instead;
#   * `fct_reorder()` takes `.desc`, not `desc`; the bare `desc=F` was
#     silently ignored (hence the "components of ... were not used: desc"
#     warning in the knitted output).
filtered <- data_raw
filtered$RunID <- forcats::fct_reorder(
  filtered$RunID, filtered[[targetVar]],
  .fun = median, .desc = FALSE
)

# Boxplot of the target metric per run, split by Language.
# The y-axis is capped at the `outliersQuantile` quantile to suppress extreme
# outliers (rows above the cap are dropped by scale limits, which is what
# produces the "Removed ... rows" warning).
p <- ggplot(data = filtered,
            aes_string(x = 'RunID', y = targetVar,
                       fill = 'Language', color = 'Language')) +
  geom_boxplot(outlier.size = 0.5, size = 0.5, alpha = 0.6,
               outlier.shape = outliersShape) +
  scale_y_continuous(
    limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
    labels = scales::comma_format(accuracy = 1, suffix = targetUnits)
  )
ggplot_colors(p)
## Warning: Removed 910 rows containing non-finite values (stat_boxplot).

By Setup

# Same per-run boxplot, but faceted by Setup (Databricks / Local VM / Cluster)
# so each environment can be compared independently.
# `aes_string()` is soft-deprecated; the `.data[[ ]]` tidy-eval pronoun is the
# supported way to map a column whose name is held in a string (targetVar).
p <- ggplot(data = filtered,
            aes(x = RunID, y = .data[[targetVar]],
                fill = Language, color = Language)) +
  geom_boxplot(outlier.size = 0.5, size = 0.5, alpha = 0.6,
               outlier.shape = outliersShape) +
  scale_y_continuous(
    limits = c(0, quantile(filtered[[targetVar]], probs = outliersQuantile)),
    labels = scales::comma_format(accuracy = 1, suffix = targetUnits)
  ) +
  facet_wrap(~ Setup, ncol = 1)
ggplot_colors(p)
## Warning: Removed 910 rows containing non-finite values (stat_boxplot).